{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "85bf9389",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim\n",
    "from gensim.models import Word2Vec\n",
    "from datasets import load_dataset\n",
    "from gensim import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4d38e659",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset rotten_tomatoes (/home/zhenya/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8530\n"
     ]
    }
   ],
   "source": [
    "# Load only the training split of the \"rotten_tomatoes\" dataset.\n",
    "train_dataset = load_dataset(path=\"rotten_tomatoes\", split=\"train\")\n",
    "\n",
    "# Report how many examples the split contains.\n",
    "print(len(train_dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f8951f81",
   "metadata": {},
   "outputs": [],
   "source": [
    "class RottenTomatoesCorpus:\n",
    "    \"\"\"Iterable that yields one list of tokens per stored text.\n",
    "\n",
    "    Each text first has its stopwords removed with gensim's\n",
    "    ``remove_stopwords`` and is then tokenised with\n",
    "    ``gensim.utils.simple_preprocess``. Every call to ``iter()`` starts a new\n",
    "    pass over ``self.sentences``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, sentences):\n",
    "        # Collection of raw text strings; stored by reference, not copied.\n",
    "        self.sentences = sentences\n",
    "\n",
    "    def __iter__(self):\n",
    "        strip_stopwords = gensim.parsing.preprocessing.remove_stopwords\n",
    "        for text in self.sentences:\n",
    "            without_stopwords = strip_stopwords(text)\n",
    "            tokens = utils.simple_preprocess(without_stopwords)\n",
    "            yield tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b37df2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw strings taken from the dataset's \"text\" column.\n",
    "sentences = train_dataset[\"text\"]\n",
    "\n",
    "# Wrap them so that each pass over the corpus yields preprocessed token lists.\n",
    "corpus = RottenTomatoesCorpus(sentences=sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ce21a5ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Passing `sentences` to the constructor builds the vocabulary AND trains the\n",
    "# model, so the epoch count is set here and no separate `model.train(...)`\n",
    "# call follows (a second call would train again on top of the constructor's\n",
    "# own training pass).\n",
    "model = Word2Vec(\n",
    "    sentences=corpus,\n",
    "    vector_size=100,\n",
    "    window=5,\n",
    "    min_count=1,  # keep every token, including words that occur only once\n",
    "    workers=4,\n",
    "    epochs=100,\n",
    ")\n",
    "\n",
    "# Persist the trained model so it can be reloaded without retraining.\n",
    "model.save(\"../data/rotten_tomato_word2vec.model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "de1dd6d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('sequels', 0.38357362151145935), ('film', 0.33577531576156616), ('stuffed', 0.2925359606742859), ('quirkily', 0.28789234161376953), ('convict', 0.2810690104961395), ('worse', 0.2789292335510254), ('churn', 0.27702808380126953), ('hellish', 0.27698105573654175), ('hey', 0.27566075325012207), ('happens', 0.27498629689216614)]\n"
     ]
    }
   ],
   "source": [
    "# Query word whose nearest neighbours in the trained vectors are inspected.\n",
    "w1 = \"movie\"\n",
    "\n",
    "# Ten most similar vocabulary entries, as (word, similarity) pairs.\n",
    "words = model.wv.most_similar(positive=w1, topn=10)\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e678b165",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0015881418740074113\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the locally trained vectors on the analogy question file.\n",
    "# The call returns a pair: the overall score first, then a second value that\n",
    "# is stored in `word_list` but not used in this cell.\n",
    "analogy_score, word_list = model.wv.evaluate_word_analogies(\n",
    "    '../data/questions-words.txt'\n",
    ")\n",
    "print(analogy_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c3864b1b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7401448525607863\n"
     ]
    }
   ],
   "source": [
    "# Load pretrained vectors stored in binary word2vec format.\n",
    "pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(\n",
    "    '../data/GoogleNews-vectors-negative300.bin.gz',\n",
    "    binary=True,\n",
    ")\n",
    "\n",
    "# Score the pretrained vectors on the analogy question file.\n",
    "# Note: this rebinds `analogy_score` and `word_list`.\n",
    "analogy_score, word_list = pretrained_model.evaluate_word_analogies(\n",
    "    '../data/questions-words.txt'\n",
    ")\n",
    "print(analogy_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "e3f7bcf3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'gensim.models.keyedvectors.KeyedVectors'>\n",
      "<class 'gensim.models.word2vec.Word2Vec'>\n"
     ]
    }
   ],
   "source": [
    "# Show the concrete class of each object, one per line: the pretrained\n",
    "# vectors first, then the locally trained model.\n",
    "print(type(pretrained_model), type(model), sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f78f24e7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
